{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CBk3jQ3fVWUp"
      },
      "outputs": [],
      "source": [
        "# Copyright 2024 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Z6z9Ibm0VXRd"
      },
      "source": [
        "# Ingestion of Unstructured Documents with Metadata in Vertex AI Search\n",
        "\n",
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://art-analytics.appspot.com/r.html?uaid=G-FHXEFWTT4E&utm_source=aRT-vais-building-blocks&utm_medium=aRT-clicks&utm_campaign=vais-building-blocks&destination=vais-building-blocks&url=https%3A%2F%2Fcolab.research.google.com%2Fgithub%2FGoogleCloudPlatform%2Fapplied-ai-engineering-samples%2Fblob%2Fmain%2Fsearch%2Fvais-building-blocks%2Fingesting_unstructured_documents_with_metadata.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg\" alt=\"Google Colaboratory logo\"><br> Open in Colab\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://art-analytics.appspot.com/r.html?uaid=G-FHXEFWTT4E&utm_source=aRT-vais-building-blocks&utm_medium=aRT-clicks&utm_campaign=vais-building-blocks&destination=vais-building-blocks&url=https%3A%2F%2Fconsole.cloud.google.com%2Fvertex-ai%2Fcolab%2Fimport%2Fhttps%3A%252F%252Fraw.githubusercontent.com%252FGoogleCloudPlatform%252Fapplied-ai-engineering-samples%252Fmain%252Fsearch%252Fvais-building-blocks%252Fingesting_unstructured_documents_with_metadata.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN\" alt=\"Google Cloud Colab Enterprise logo\"><br> Open in Colab Enterprise\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://art-analytics.appspot.com/r.html?uaid=G-FHXEFWTT4E&utm_source=aRT-vais-building-blocks&utm_medium=aRT-clicks&utm_campaign=vais-building-blocks&destination=vais-building-blocks&url=https%3A%2F%2Fconsole.cloud.google.com%2Fvertex-ai%2Fworkbench%2Fdeploy-notebook%3Fdownload_url%3Dhttps%3A%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fapplied-ai-engineering-samples%2Fmain%2Fsearch%2Fvais-building-blocks%2Fingesting_unstructured_documents_with_metadata.ipynb\">\n",
        "      <img src=\"https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg\" alt=\"Vertex AI logo\"><br> Open in Vertex AI Workbench\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://github.com/GoogleCloudPlatform/applied-ai-engineering-samples/blob/main/search/vais-building-blocks/ingesting_unstructured_documents_with_metadata.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg\" alt=\"GitHub logo\"><br> View on GitHub\n",
        "    </a>\n",
        "  </td>\n",
        "</table>\n",
        "\n",
        "<div style=\"clear: both;\"></div>\n",
        "\n",
        "<b>Share to:</b>\n",
        "\n",
        "<a href=\"https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/ingesting_unstructured_documents_with_metadata.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg\" alt=\"LinkedIn logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/ingesting_unstructured_documents_with_metadata.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg\" alt=\"Bluesky logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/ingesting_unstructured_documents_with_metadata.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg\" alt=\"X logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/ingesting_unstructured_documents_with_metadata.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png\" alt=\"Reddit logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/ingesting_unstructured_documents_with_metadata.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg\" alt=\"Facebook logo\">\n",
        "</a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rhWHRiVePfQV"
      },
      "source": [
        "| | |\n",
        "|----------|-------------|\n",
        "| Author(s)   | Hossein Mansour|\n",
        "| Reviewers(s) | Meltem Subasioglu, Rajesh Thallam|\n",
        "| Last updated | 2024-07-23: The first draft |"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GVFVUibCCiAD"
      },
      "source": [
        "# Overview\n",
        "\n",
        "In this notebook, we will show you how to prepare and ingest unstructured documents with metadata into [Vertex AI Search](https://cloud.google.com/generative-ai-app-builder/docs/introduction). Metadata can be used for different purposes such as improving recall and precision, influencing results via boosting and filtering, and including additional context to be retrieved together with the documents. You can find more information about different types of metadata [here](https://cloud.google.com/generative-ai-app-builder/docs/provide-schema#about_providing_your_own_schema_as_a_json_object).\n",
        "\n",
        "We will perform the following steps:\n",
        "\n",
        "- Creating a Vertex AI Search Datastore\n",
        "- Creating a Vertex AI Search App\n",
        "- [Optional] Updating the Schema for the Datastore\n",
        "- Reading Documents and their Metadata from a GCS bucket and combining them together as JSONL file\n",
        "- Uploading the documents with their metadata to the Datastore\n",
        "- Searching the Datastore\n",
        "\n",
        "\n",
        "Please refer to the [official documentation](https://cloud.google.com/generative-ai-app-builder/docs/create-datastore-ingest) of Vertex AI Search for the definition of Datastores and Apps and their relationships to one another.\n",
        "\n",
        "REST API is used throughout this notebook. Please consult the [official documentation](https://cloud.google.com/generative-ai-app-builder/docs/apis) for alternative ways to achieve the same goal, namely Client libraries and RPC.\n",
        "\n",
        "\n",
        "## Vertex AI Search\n",
        "Vertex AI Search (VAIS) is a fully-managed platform, powered by large language models, that lets you build AI-enabled search and recommendation experiences for your public or private websites or mobile applications\n",
        "\n",
        "VAIS can handle a diverse set of data sources including structured, unstructured, and website data, as well as data from third-party applications such as Jira, Salesforce, and Confluence.\n",
        "\n",
        "VAIS also has built-in integration with LLMs which enables you to provide answers to complex questions, grounded in your data\n",
        "\n",
        "## Using this Notebook\n",
        "If you're running outside of Colab, depending on your environment you may need to install pip packages that are included in the Colab environment by default but are not part of the Python Standard Library. Outside of Colab you'll also notice comments in code cells that look like #@something, these trigger special Colab functionality but don't change the behavior of the notebook.\n",
        "\n",
        "This tutorial uses the following Google Cloud services and resources:\n",
        "\n",
        "- Service Usage API\n",
        "- Discovery Engine\n",
        "- Google Cloud Storage Client\n",
        "\n",
        "This notebook has been tested in the following environment:\n",
        "\n",
        "- Python version = 3.10.12\n",
        "- google.cloud.storage = 2.8.0\n",
        "- google.auth = 2.27.0\n",
        "\n",
        "# Getting Started\n",
        "\n",
        "The following steps are necessary to run this notebook, no matter what notebook environment you're using.\n",
        "\n",
        "If you're entirely new to Google Cloud, [get started here](https://cloud.google.com/docs/get-started)\n",
        "\n",
        "## Google Cloud Project Setup\n",
        "\n",
        "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs\n",
        "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project)\n",
        "3. [Enable the Service Usage API](https://console.cloud.google.com/apis/library/serviceusage.googleapis.com)\n",
        "4. [Enable the Cloud Storage API](https://console.cloud.google.com/flows/enableapi?apiid=storage.googleapis.com)\n",
        "5. [Enable the Discovery Engine API for your project](https://console.cloud.google.com/marketplace/product/google/discoveryengine.googleapis.com)\n",
        "\n",
        "## Google Cloud Permissions\n",
        "\n",
        "Ideally you should have [Owner role](https://cloud.google.com/iam/docs/understanding-roles) for your project to run this notebook. If that is not an option, you need at least the following [roles](https://cloud.google.com/iam/docs/granting-changing-revoking-access)\n",
        "- **`roles/serviceusage.serviceUsageAdmin`** to enable APIs\n",
        "- **`roles/iam.serviceAccountAdmin`** to modify service agent permissions\n",
        "- **`roles/discoveryengine.admin`** to modify discoveryengine assets\n",
        "- **`roles/storage.objectAdmin`** to modify and delete GCS buckets\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NuQphNnDp3xA"
      },
      "source": [
        "# Setup Environment"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YqcV8aj8GvZA"
      },
      "source": [
        "## Authentication\n",
        "\n",
        " If you're using Colab, run the code in the next cell. Follow the pop-ups and authenticate with an account that has access to your Google Cloud [project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects).\n",
        "\n",
        "If you're running this notebook somewhere besides Colab, make sure your environment has the right Google Cloud access. If that's a new concept to you, consider looking into [Application Default Credentials for your local environment](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-dev) and [initializing the Google Cloud CLI](https://cloud.google.com/docs/authentication/gcloud). In many cases, running `gcloud auth application-default login` in a shell on the machine running the notebook kernel is sufficient.\n",
        "\n",
        "More authentication options are discussed [here](https://cloud.google.com/docs/authentication)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DZjtfEDG7Sr3"
      },
      "outputs": [],
      "source": [
        "# Colab authentication.\n",
        "import sys\n",
        "\n",
        "if \"google.colab\" in sys.modules:\n",
        "    from google.colab import auth\n",
        "\n",
        "    auth.authenticate_user()\n",
        "    print(\"Authenticated\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kT3Eda7_mlTP"
      },
      "outputs": [],
      "source": [
        "from google.auth import default\n",
        "from google.auth.transport.requests import AuthorizedSession\n",
        "\n",
        "creds, _ = default()\n",
        "authed_session = AuthorizedSession(creds)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_OyCUmMVGeo-"
      },
      "source": [
        "## Import Libraries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gCIgR1NCatrP"
      },
      "outputs": [],
      "source": [
        "import glob\n",
        "import json\n",
        "import os\n",
        "import re\n",
        "import shutil\n",
        "import time\n",
        "from typing import Any\n",
        "from urllib.parse import urlparse\n",
        "\n",
        "from google.cloud import storage\n",
        "import pandas as pd\n",
        "import requests"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KTSL1m_CHFBI"
      },
      "source": [
        "## Configure environment\n",
        "\n",
        "You can enter the ID for an existing App and Datastore to be used in this notebook. Alternatively, you can enter the desired IDs for non-existings App and Datastore and they will be created later in this notebook.\n",
        "\n",
        "Same applies to the GCS Directory of Documents and Metadata. The Documents and Metadata can be in separate buckets, but it is advised to keep them (together with the JSONL created later in this notebook) in the same temporary bucket for the ease of cleanup.\n",
        "\n",
        "You can find more information regarding the \"Location\" of datastores and associated limitations [here](https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store). The Location of a Datastore is set at the time of creation and it should be called appropriately to query the Datastore."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "089_6PdMa64e"
      },
      "outputs": [],
      "source": [
        "PROJECT_ID = \"\"  # @param {type:\"string\"}\n",
        "\n",
        "# Vertex AI Search Parameters\n",
        "DATASTORE_ID = \"\"  # @param {type:\"string\"}\n",
        "APP_ID = \"\"  # @param {type:\"string\"}\n",
        "LOCATION = \"global\"  # @param [\"global\", \"us\", \"eu\"] Global is preferred\n",
        "\n",
        "# GCS Parameters, e.g. 'gs://my_bucket/folder1/docs/'\n",
        "GCS_DIRECTORY_DOCS = \"\"  # @param {type:\"string\"}\n",
        "GCS_DIRECTORY_METADATA = \"\"  # @param {type:\"string\"}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sWkMcJej-2Gy"
      },
      "source": [
        "# Create VAIS App and Datastore"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2sSInI07MCll"
      },
      "source": [
        "## [Prerequisite] Create a GCS bucket with sample documents\n",
        "\n",
        "This step is only needed for the purpose of this demo. For the real use case you will need to upload your actual documents to a GCS bucket.\n",
        "\n",
        "Here, we download Alphabet's 2022 Q1-Q4 Earning transcripts as sample documents."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Yg0Y_T_iLx_K"
      },
      "outputs": [],
      "source": [
        "def create_gcs_bucket_and_download_files(project_id, new_bucket_path, file_urls):\n",
        "    \"\"\"\n",
        "    Creates a new GCS bucket (if it doesn't exist) and downloads files from specified URLs.\n",
        "\n",
        "    Handles paths with subdirectories correctly using `urlparse`.\n",
        "    \"\"\"\n",
        "\n",
        "    if not new_bucket_path.startswith(\"gs://\") or not new_bucket_path.endswith(\"/\"):\n",
        "        raise ValueError(\n",
        "            \"Invalid GCS path format. Must start with 'gs://' and end with '/'. \"\n",
        "            f\"Received: '{new_bucket_path}'\"\n",
        "        )\n",
        "\n",
        "    storage_client = storage.Client(project=project_id)\n",
        "\n",
        "    # Extract bucket name and prefix from path\n",
        "    parsed_path = urlparse(new_bucket_path)\n",
        "    new_bucket_name = parsed_path.netloc\n",
        "    blob_prefix = parsed_path.path.strip(\"/\")  # Remove leading and trailing slashes\n",
        "\n",
        "    new_bucket = storage_client.bucket(new_bucket_name)\n",
        "\n",
        "    if not new_bucket.exists():\n",
        "        new_bucket = storage_client.create_bucket(new_bucket_name)\n",
        "        print(f\"Bucket {new_bucket_name} created.\")\n",
        "\n",
        "    for url in file_urls:\n",
        "        file_name = url.split(\"/\")[-1]\n",
        "        print(f\"Downloading: {file_name}\")\n",
        "\n",
        "        try:\n",
        "            response = requests.get(url)\n",
        "            response.raise_for_status()\n",
        "\n",
        "            # Construct the full blob path (including prefix)\n",
        "            blob_name = f\"{blob_prefix}/{file_name}\" if blob_prefix else file_name\n",
        "            blob = new_bucket.blob(blob_name)\n",
        "\n",
        "            blob.upload_from_string(response.content)\n",
        "            print(f\"Uploaded: {blob_name}\")  # Print the uploaded blob path\n",
        "        except requests.exceptions.RequestException as e:\n",
        "            print(f\"Error downloading {file_name}: {e}\")\n",
        "\n",
        "\n",
        "file_urls = [\n",
        "    \"https://abc.xyz/assets/investor/static/pdf/2022_Q1_Earnings_Transcript.pdf\",\n",
        "    \"https://abc.xyz/assets/investor/static/pdf/2022_Q2_Earnings_Transcript.pdf\",\n",
        "    \"https://abc.xyz/assets/investor/static/pdf/2022_Q3_Earnings_Transcript.pdf\",\n",
        "    \"https://abc.xyz/assets/investor/static/pdf/2022_Q4_Earnings_Transcript.pdf\",\n",
        "]\n",
        "\n",
        "create_gcs_bucket_and_download_files(PROJECT_ID, GCS_DIRECTORY_DOCS, file_urls)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8qFNMjxJMuEr"
      },
      "source": [
        "## [Prerequisite] Create a GCS bucket with sample Metadata\n",
        "\n",
        "Similar to the code block above, this step is only needed for the purpose of this demo.\n",
        "\n",
        "Here we extract some trivial metadata from the file name. Each Metadata will have a content similar to the one below:\n",
        "\n",
        "```json\n",
        " {\n",
        "     \"doc_name\": \"2022_Q1_Earnings_Transcript\",\n",
        "     \"year\": \"2022\",\n",
        "     \"quarter\": \"Q1\",\n",
        "     \"doc_type\": \"earnings transcript\",\n",
        "     \"stock_tickers\": [\"GOOG\", \"GOOGL\"],\n",
        "     \"company_name\": \"alphabet\",\n",
        " }\n",
        " ```"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xh_kZyI1MkpG"
      },
      "outputs": [],
      "source": [
        "def create_metadata_files(source_folder_path, metadata_folder_path):\n",
        "    \"\"\"Creates metadata JSON files for documents in a GCS folder.\"\"\"\n",
        "\n",
        "    if not metadata_folder_path.startswith(\n",
        "        \"gs://\"\n",
        "    ) or not metadata_folder_path.endswith(\"/\"):\n",
        "        raise ValueError(\n",
        "            \"Invalid GCS path format. Must start with 'gs://' and end with '/'. \"\n",
        "            f\"Received: '{metadata_folder_path}'\"\n",
        "        )\n",
        "\n",
        "    bucket_name = source_folder_path.split(\"/\")[2]\n",
        "    storage_client = storage.Client()\n",
        "    bucket = storage_client.bucket(bucket_name)\n",
        "\n",
        "    source_folder = source_folder_path.replace(f\"gs://{bucket_name}/\", \"\")\n",
        "    metadata_folder = metadata_folder_path.replace(f\"gs://{bucket_name}/\", \"\")\n",
        "\n",
        "    blobs = bucket.list_blobs(prefix=source_folder)\n",
        "\n",
        "    for blob in blobs:\n",
        "        # Explicitly check if the blob is a folder/directory\n",
        "        if blob.name.endswith(\"/\"):\n",
        "            print(f\"Skipping folder: {blob.name}\")\n",
        "            continue\n",
        "\n",
        "        # Get the filename by splitting on the last \"/\"\n",
        "        filename = blob.name.split(\"/\")[-1]\n",
        "\n",
        "        # Improved regex to match a wider variety of file names\n",
        "        doc_name_match = re.match(r\"(\\d{4})_Q(\\d)_\\w+_Transcript\\.pdf\", filename)\n",
        "        if not doc_name_match:\n",
        "            print(f\"Skipping file with unexpected name: {filename}\")\n",
        "            continue\n",
        "\n",
        "        year, quarter = doc_name_match.groups()\n",
        "\n",
        "        # Construct doc_type from the filename (without path)\n",
        "        doc_type = \"_\".join(filename.split(\"_\")[2:-1]).replace(\"_\", \" \")\n",
        "\n",
        "        metadata = {\n",
        "            \"doc_name\": filename.replace(\".pdf\", \"\"),\n",
        "            \"year\": year,\n",
        "            \"quarter\": f\"Q{quarter}\",\n",
        "            \"doc_type\": doc_type,\n",
        "            \"stock_tickers\": [\"GOOG\", \"GOOGL\"],\n",
        "            \"company_name\": \"alphabet\",\n",
        "        }\n",
        "\n",
        "        metadata_file_name = f\"{metadata['doc_name']}.txt\"\n",
        "        metadata_blob = bucket.blob(metadata_folder + metadata_file_name)\n",
        "\n",
        "        metadata_blob.upload_from_string(json.dumps(metadata, indent=4))\n",
        "\n",
        "        print(f\"Created metadata file: {metadata_blob.name}\")\n",
        "\n",
        "\n",
        "create_metadata_files(GCS_DIRECTORY_DOCS, GCS_DIRECTORY_METADATA)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C2hXlewDINDg"
      },
      "source": [
        "## Helper functions to issue basic search on a Datastore or an App"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "v-XHQIOooshe"
      },
      "outputs": [],
      "source": [
        "def search_by_datastore(\n",
        "    project_id: str, location: str, datastore_id: str, query: str\n",
        ") -> dict[str, Any]:\n",
        "    \"\"\"Searches a datastore using the provided query.\"\"\"\n",
        "    response = authed_session.post(\n",
        "        f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/servingConfigs/default_search:search\",\n",
        "        headers={\n",
        "            \"Content-Type\": \"application/json\",\n",
        "        },\n",
        "        json={\"query\": query, \"pageSize\": 1},\n",
        "    )\n",
        "    return response\n",
        "\n",
        "\n",
        "def search_by_app(\n",
        "    project_id: str, location: str, app_id: str, query: str\n",
        ") -> dict[str, Any]:\n",
        "    \"\"\"Searches an app using the provided query.\"\"\"\n",
        "    response = authed_session.post(\n",
        "        f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/engines/{app_id}/servingConfigs/default_config:search\",\n",
        "        headers={\n",
        "            \"Content-Type\": \"application/json\",\n",
        "        },\n",
        "        json={\"query\": query, \"pageSize\": 1},\n",
        "    )\n",
        "    return response"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eAigF6KHkMZ2"
      },
      "source": [
        "## Helper functions to check whether or not a Datastore or an App already exist"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "IO1AxLZckXYK"
      },
      "outputs": [],
      "source": [
        "def datastore_exists(project_id: str, location: str, datastore_id: str) -> bool:\n",
        "    \"\"\"Check if a datastore exists.\"\"\"\n",
        "    response = search_by_datastore(project_id, location, datastore_id, \"test\")\n",
        "    status_code = response.status_code\n",
        "    if status_code == 200:\n",
        "        return True\n",
        "    if status_code == 404:\n",
        "        return False\n",
        "    raise Exception(f\"Error: {status_code}\")\n",
        "\n",
        "\n",
        "def app_exists(project_id: str, location: str, app_id: str) -> bool:\n",
        "    \"\"\"Check if an App exists.\"\"\"\n",
        "    response = search_by_app(project_id, location, app_id, \"test\")\n",
        "    status_code = response.status_code\n",
        "    if status_code == 200:\n",
        "        return True\n",
        "    if status_code == 404:\n",
        "        return False\n",
        "    raise Exception(f\"Error: {status_code}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dsUql1_wkeaO"
      },
      "source": [
        "## Helper functions to create a Datastore or an App\n",
        "\n",
        "The datastore is created with [Chunk Mode](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents) and Chunk size of 500 tokens.\n",
        "\n",
        "The documents will be processed with Layout parser (higher quality for complex documents containing elements like tables and lists) and Ancestor information (i.e. headings) is included with each Chunk. Please see [official documentation](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents) for more details.\n",
        "\n",
        "These settings are chosen to optimize accuracy, they can be adjusted in the create_datastore function below."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SxR3hw5Tke-q"
      },
      "outputs": [],
      "source": [
        "def create_datastore(project_id: str, location: str, datastore_id: str) -> int:\n",
        "    \"\"\"Create a datastore.\"\"\"\n",
        "    payload = {\n",
        "        \"displayName\": datastore_id,\n",
        "        \"industryVertical\": \"GENERIC\",\n",
        "        \"solutionTypes\": [\"SOLUTION_TYPE_SEARCH\"],\n",
        "        \"contentConfig\": \"CONTENT_REQUIRED\",\n",
        "        \"documentProcessingConfig\": {\n",
        "            \"chunkingConfig\": {\n",
        "                \"layoutBasedChunkingConfig\": {\n",
        "                    \"chunkSize\": 500,\n",
        "                    \"includeAncestorHeadings\": True,\n",
        "                }\n",
        "            },\n",
        "            \"defaultParsingConfig\": {\"layoutParsingConfig\": {}},\n",
        "        },\n",
        "    }\n",
        "    header = {\"X-Goog-User-Project\": project_id, \"Content-Type\": \"application/json\"}\n",
        "    es_endpoint = f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/dataStores?dataStoreId={datastore_id}\"\n",
        "    response = authed_session.post(\n",
        "        es_endpoint, data=json.dumps(payload), headers=header\n",
        "    )\n",
        "    if response.status_code == 200:\n",
        "        print(f\"The creation of Datastore {datastore_id} is initiated.\")\n",
        "        print(\"It may take a few minutes for the Datastore to become available\")\n",
        "    else:\n",
        "        print(f\"Failed to create Datastore {datastore_id}\")\n",
        "        print(response.json())\n",
        "    return response.status_code\n",
        "\n",
        "\n",
        "def create_app(project_id: str, location: str, datastore_id: str, app_id: str) -> int:\n",
        "    \"\"\"Create a search app.\"\"\"\n",
        "    payload = {\n",
        "        \"displayName\": app_id,\n",
        "        \"dataStoreIds\": [datastore_id],\n",
        "        \"solutionType\": \"SOLUTION_TYPE_SEARCH\",\n",
        "        \"searchEngineConfig\": {\n",
        "            \"searchTier\": \"SEARCH_TIER_ENTERPRISE\",\n",
        "            \"searchAddOns\": [\"SEARCH_ADD_ON_LLM\"],\n",
        "        },\n",
        "    }\n",
        "    header = {\"X-Goog-User-Project\": project_id, \"Content-Type\": \"application/json\"}\n",
        "    es_endpoint = f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/engines?engineId={app_id}\"\n",
        "    response = authed_session.post(\n",
        "        es_endpoint, data=json.dumps(payload), headers=header\n",
        "    )\n",
        "    if response.status_code == 200:\n",
        "        print(f\"The creation of App {app_id}  is initiated.\")\n",
        "        print(\"It may take a few minutes for the App to become available\")\n",
        "    else:\n",
        "        print(f\"Failed to create App {app_id}\")\n",
        "        print(response.json())\n",
        "    return response.status_code"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1hAp5cBnIYxJ"
      },
      "source": [
        "## Create a Datastore with the provided ID if it doesn't exist"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hBUwJxxeAazj"
      },
      "outputs": [],
      "source": [
        "if datastore_exists(PROJECT_ID, LOCATION, DATASTORE_ID):\n",
        "    print(f\"Datastore {DATASTORE_ID} already exists.\")\n",
        "else:\n",
        "    create_datastore(PROJECT_ID, LOCATION, DATASTORE_ID)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C1d-pd2WLJZI"
      },
      "source": [
        "## [Optional] Check if the Datastore is created successfully\n",
        "\n",
        "\n",
        "The Datastore is polled to track when it becomes available.\n",
        "\n",
        "This may take a few minutes"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EZGzOCnTLOwf"
      },
      "outputs": [],
      "source": [
        "# Check every 30 seconds until the Datastore is reported as existing.\n",
        "while True:\n",
        "    if datastore_exists(PROJECT_ID, LOCATION, DATASTORE_ID):\n",
        "        break\n",
        "    print(f\"Datastore {DATASTORE_ID} is still being created.\")\n",
        "    time.sleep(30)\n",
        "print(f\"Datastore {DATASTORE_ID} is created successfully.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vSzz2AzmI5kx"
      },
      "source": [
        "## Create an App with the provided ID if it doesn't exist\n",
        "The App will be connected to the Datastore whose ID was provided earlier in this notebook."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4lp4kPXNm9sE"
      },
      "outputs": [],
      "source": [
        "# Create the App only if no App with this ID exists yet.\n",
        "if not app_exists(PROJECT_ID, LOCATION, APP_ID):\n",
        "    create_app(PROJECT_ID, LOCATION, DATASTORE_ID, APP_ID)\n",
        "else:\n",
        "    print(f\"App {APP_ID} already exists.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fxlTn7dVK-Q2"
      },
      "source": [
        "## [Optional] Check if the App is created successfully\n",
        "\n",
        "\n",
        "The App is polled to track when it becomes available.\n",
        "\n",
        "This may take a few minutes"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZuQQ2HCGK4BA"
      },
      "outputs": [],
      "source": [
        "# Check every 30 seconds until the App is reported as existing.\n",
        "while True:\n",
        "    if app_exists(PROJECT_ID, LOCATION, APP_ID):\n",
        "        break\n",
        "    print(f\"App {APP_ID} is still being created.\")\n",
        "    time.sleep(30)\n",
        "print(f\"App {APP_ID} is created successfully.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MQJ8In3r-2G0"
      },
      "source": [
        "# Providing your own schema for the Metadata"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-7LR113gJg7B"
      },
      "source": [
        "## [Optional] Provide your own Schema\n",
        "\n",
        " The schema is detected automatically but it can be optionally adjusted to decide which fields should be:\n",
        "\n",
        " - Retrievable (returned in the response),\n",
        " - Searchable (searched through term-based and semantically),\n",
        " - Indexable (filtered, boosted etc)\n",
        "\n",
        "We can also specify keyProperties which gives special retrieval treatment to certain fields.\n",
        "\n",
        "Note that the Schema is only relevant to the Metadata, not to the actual documents and their hierarchical structure.\n",
        "\n",
        "See this documentation on [auto-detecting versus providing your own Schema](https://cloud.google.com/generative-ai-app-builder/docs/provide-schema)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "aO13rwu1Q6jr"
      },
      "outputs": [],
      "source": [
        "# Settings shared by the metadata fields that are returned and indexed,\n",
        "# but neither searched nor dynamically faceted.\n",
        "filterable_string_field: dict[str, Any] = {\n",
        "    \"retrievable\": True,\n",
        "    \"indexable\": True,\n",
        "    \"dynamicFacetable\": False,\n",
        "    \"searchable\": False,\n",
        "    \"type\": \"string\",\n",
        "}\n",
        "\n",
        "schema: dict[str, Any] = {\n",
        "    \"structSchema\": {\n",
        "        \"type\": \"object\",\n",
        "        \"properties\": {\n",
        "            # doc_name is mapped to the \"title\" key property.\n",
        "            \"doc_name\": {\n",
        "                \"keyPropertyMapping\": \"title\",\n",
        "                \"retrievable\": True,\n",
        "                \"dynamicFacetable\": False,\n",
        "                \"type\": \"string\",\n",
        "            },\n",
        "            \"year\": dict(filterable_string_field),\n",
        "            \"quarter\": dict(filterable_string_field),\n",
        "            \"doc_type\": dict(filterable_string_field),\n",
        "            # Each item of the array is mapped to the \"category\" key property.\n",
        "            \"stock_tickers\": {\n",
        "                \"type\": \"array\",\n",
        "                \"items\": {\"type\": \"string\", \"keyPropertyMapping\": \"category\"},\n",
        "            },\n",
        "            \"company_name\": dict(filterable_string_field),\n",
        "        },\n",
        "        \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n",
        "    }\n",
        "}\n",
        "\n",
        "schema_patch_url = f\"https://discoveryengine.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/dataStores/{DATASTORE_ID}/schemas/default_schema\"\n",
        "response = authed_session.patch(\n",
        "    schema_patch_url,\n",
        "    headers={\"Content-Type\": \"application/json\"},\n",
        "    json=schema,\n",
        ")\n",
        "schema_patch_result = response.json()\n",
        "print(schema_patch_result)\n",
        "# Keep the operation name from the response; the next cell polls it.\n",
        "schema_update_lro = schema_patch_result[\"name\"]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EeuAlWHJKZ2t"
      },
      "source": [
        "## Check the status of Schema update\n",
        "\n",
        "For an empty Datastore the Schema update should be almost instantaneous.\n",
        "\n",
        "A request to update the schema creates a [Long-Running Operation](https://cloud.google.com/generative-ai-app-builder/docs/long-running-operations) which can be polled."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "om_a4O4NV-dU"
      },
      "outputs": [],
      "source": [
        "# Poll the schema-update Long-Running Operation every 10 seconds until it finishes.\n",
        "while True:\n",
        "    response = authed_session.get(\n",
        "        f\"https://discoveryengine.googleapis.com/v1/{schema_update_lro}\",\n",
        "    )\n",
        "    if response.status_code != 200:\n",
        "        # A failed poll never reports \"done\"; stop instead of looping forever.\n",
        "        print(f\"Failed to poll the schema update: {response.text}\")\n",
        "        break\n",
        "    operation = response.json()\n",
        "    # \"done\" may be missing from the payload while the operation is running,\n",
        "    # so read it with .get() rather than relying on a KeyError.\n",
        "    if operation.get(\"done\"):\n",
        "        if \"error\" in operation:\n",
        "            print(f\"Schema update failed: {operation['error']}\")\n",
        "        else:\n",
        "            print(\"Schema update completed!\")\n",
        "        break\n",
        "    print(\"Schema update in progress.\")\n",
        "    time.sleep(10)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Mr3n2jbaLoo5"
      },
      "source": [
        "## [Optional] Get the current Schema\n",
        "This block can be used to check whether or not the schema is in the desired state (particularly useful for an auto-detected schema)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ueel8I0NS66p"
      },
      "outputs": [],
      "source": [
        "# Fetch the default schema of the Datastore and display it as the cell output.\n",
        "current_schema_url = f\"https://discoveryengine.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/dataStores/{DATASTORE_ID}/schemas/default_schema\"\n",
        "resp = authed_session.get(current_schema_url)\n",
        "resp.json()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lh3TvqnNqQoP"
      },
      "source": [
        "# Prepare documents with metadata for ingestion"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xoq4nUxyPuF5"
      },
      "source": [
        "## Define the path to documents and Metadata (both in GCS and Local)\n",
        "The JSONL GCS Directory will be used to store the JSONL file to-be-created. If such a directory does not exist, it will be created.\n",
        "\n",
        "For the purpose of this demo, the documents and their corresponding metadata are joined based on the FIELD_FOR_FILE_NAME within the metadata (doc_name in this example).\n",
        "\n",
        "Based on that convention, the metadata for \"2022_Q1_Earnings_Transcript.pdf\" will have the following content:\n",
        "\n",
        "```json\n",
        " {\n",
        "     \"doc_name\": \"2022_Q1_Earnings_Transcript\",\n",
        "     \"year\": \"2022\",\n",
        "     \"quarter\": \"Q1\",\n",
        "     \"doc_type\": \"earnings transcript\",\n",
        "     \"stock_tickers\": [\"GOOG\", \"GOOGL\"],\n",
        "     \"company_name\": \"alphabet\",\n",
        " }\n",
        " ```\n",
        "\n",
        "The logic is applied for illustration purposes and you can apply any other joining logic that fits your data (e.g. common name between metadata and document files)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "s8YpjxkrprRg"
      },
      "outputs": [],
      "source": [
        "# File extension of the documents; prepare_jsonl uses it for the MIME type and the document URI.\n",
        "DOCUMENT_FORMAT = \"pdf\"  # @param [\"docx\", \"pdf\"]\n",
        "# GCS directory that receives the JSONL file. It is concatenated directly with\n",
        "# JSONL_FILENAME later in the notebook, so it should end with \"/\".\n",
        "GCS_DIRECTORY_JSONL = \"\"  # @param {type:\"string\"}\n",
        "# Metadata field whose value is used as the document id and as the document file name (without extension).\n",
        "FIELD_FOR_FILE_NAME = \"doc_name\"  # @param {type:\"string\"}\n",
        "\n",
        "# Name of the JSONL file written locally and then referenced for the import.\n",
        "JSONL_FILENAME = \"alphabet_earnings.json\"\n",
        "# Local working directories for the documents, their metadata, and the generated JSONL.\n",
        "LOCAL_DOCS_PATH = \"data\"\n",
        "LOCAL_METADATA_PATH = \"metadata\"\n",
        "LOCAL_JSONL_PATH = \"jsonl\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_KGV_xyI1fxh"
      },
      "source": [
        "## Helper function to prepare JSONL content\n",
        "A JSONL file needs to be created which contains a joined list of documents to be ingested and their metadata. You can find more details on the expected formatting [here](https://cloud.google.com/generative-ai-app-builder/docs/prepare-data#storage-unstructured)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qwpDRUWK1gEY"
      },
      "outputs": [],
      "source": [
        "def prepare_jsonl(row: pd.Series) -> dict[str, Any]:\n",
        "    \"\"\"Builds the import record (id, structData, content) for one metadata row.\"\"\"\n",
        "    file_stem = row[FIELD_FOR_FILE_NAME]\n",
        "    if DOCUMENT_FORMAT == \"docx\":\n",
        "        mimetype = (\n",
        "            \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\"\n",
        "        )\n",
        "    else:\n",
        "        mimetype = \"application/pdf\"\n",
        "    # The document URI is the docs directory plus \"<file name>.<format>\".\n",
        "    document_uri = f\"{GCS_DIRECTORY_DOCS}{file_stem}.{DOCUMENT_FORMAT}\"\n",
        "    return {\n",
        "        \"id\": file_stem,\n",
        "        \"structData\": row.to_dict(),\n",
        "        \"content\": {\"mimeType\": mimetype, \"uri\": document_uri},\n",
        "    }"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZrJHYvSO11EG"
      },
      "source": [
        "## Prepare JSONL file and save to GCS\n",
        "Documents and their metadata are copied to the local path, loaded in a DataFrame, and processed to prepare a JSONL file with the expected format.\n",
        "The JSONL file is then uploaded to the provided GCS path."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "t8i9eB7G10ul"
      },
      "outputs": [],
      "source": [
        "# Make sure the local working directories exist\n",
        "for local_dir in (LOCAL_DOCS_PATH, LOCAL_METADATA_PATH, LOCAL_JSONL_PATH):\n",
        "    os.makedirs(local_dir, exist_ok=True)\n",
        "\n",
        "# Download the documents and their metadata from GCS\n",
        "!gsutil -m cp -r {GCS_DIRECTORY_DOCS}* {LOCAL_DOCS_PATH}\n",
        "!gsutil -m cp -r {GCS_DIRECTORY_METADATA}* {LOCAL_METADATA_PATH}\n",
        "\n",
        "# Read every metadata file as a Series and stack them into one row per file\n",
        "metadata_files = glob.glob(f\"{os.getcwd()}/{LOCAL_METADATA_PATH}/*.txt\")\n",
        "metadata_series = [pd.read_json(path, typ=\"series\") for path in metadata_files]\n",
        "df_json = pd.concat(metadata_series, axis=1).T\n",
        "\n",
        "# Build one import record per row and write the records as JSON Lines\n",
        "df_json[\"metadata\"] = df_json.apply(prepare_jsonl, axis=1)\n",
        "local_jsonl_file = f\"{LOCAL_JSONL_PATH}/{JSONL_FILENAME}\"\n",
        "df_json[\"metadata\"].to_json(local_jsonl_file, orient=\"records\", lines=True)\n",
        "\n",
        "# Push the generated JSONL file to GCS\n",
        "!gsutil -m cp {LOCAL_JSONL_PATH}/* {GCS_DIRECTORY_JSONL}\n",
        "\n",
        "# Optionally show the generated records\n",
        "print(\"\\nJSONL Content:\")\n",
        "for metadata_entry in df_json[\"metadata\"]:\n",
        "    print(json.dumps(metadata_entry, indent=2))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "J4PuuT-jqdqZ"
      },
      "source": [
        "# Ingest documents to Datastore"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "x7cW0g5t2DSk"
      },
      "source": [
        "## Import documents with metadata from JSONL on GCS\n",
        "This is where the actual import to the Datastore happens.\n",
        "The import runs asynchronously, and the request returns an instance of a \"Long-Running Operation\".\n",
        "\n",
        "This may take a few minutes. Feel free to grab a coffee."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-FVcu7wGJcom"
      },
      "outputs": [],
      "source": [
        "def import_documents_from_gcs_jsonl(\n",
        "    project_id: str, location: str, datastore_id: str, gcs_uri: str\n",
        ") -> str:\n",
        "    \"\"\"Requests an incremental import of a GCS JSONL file and returns the operation name.\"\"\"\n",
        "    import_url = f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/default_branch/documents:import\"\n",
        "    request_body = json.dumps(\n",
        "        {\n",
        "            \"reconciliationMode\": \"INCREMENTAL\",\n",
        "            \"gcsSource\": {\"inputUris\": [gcs_uri]},\n",
        "        }\n",
        "    )\n",
        "    response = authed_session.post(\n",
        "        import_url,\n",
        "        data=request_body,\n",
        "        headers={\"Content-Type\": \"application/json\"},\n",
        "    )\n",
        "    # Echo the raw reply, then hand back the \"name\" field used for polling.\n",
        "    print(f\"--{response.json()}\")\n",
        "    return response.json()[\"name\"]\n",
        "\n",
        "\n",
        "# The JSONL URI is the JSONL GCS directory followed directly by the file name.\n",
        "jsonl_gcs_uri = f\"{GCS_DIRECTORY_JSONL}{JSONL_FILENAME}\"\n",
        "import_lro = import_documents_from_gcs_jsonl(\n",
        "    PROJECT_ID, LOCATION, DATASTORE_ID, jsonl_gcs_uri\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vttgvfVB2M-H"
      },
      "source": [
        "## [Optional] Check the status of document import via polling\n",
        "Optionally check the status of the long running operation for the import job. You can check this in the UI as well by looking at the \"activity\" tab of the corresponding Datastore"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rYq_P_hDnSrh"
      },
      "outputs": [],
      "source": [
        "# Poll the import Long-Running Operation once a minute until it finishes.\n",
        "while True:\n",
        "    response = authed_session.get(\n",
        "        f\"https://discoveryengine.googleapis.com/v1/{import_lro}\",\n",
        "    )\n",
        "    if response.status_code != 200:\n",
        "        # A failed poll never reports \"done\"; stop instead of looping forever.\n",
        "        print(f\"Failed to poll the import operation: {response.text}\")\n",
        "        break\n",
        "    operation = response.json()\n",
        "    # \"done\" may be missing from the payload while the operation is running,\n",
        "    # so read it with .get() rather than relying on a KeyError.\n",
        "    if operation.get(\"done\"):\n",
        "        if \"error\" in operation:\n",
        "            print(f\"Import failed: {operation['error']}\")\n",
        "        else:\n",
        "            print(\"Import completed!\")\n",
        "        break\n",
        "    print(\"Import in progress.\")\n",
        "    time.sleep(60)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "e-qjlyc_qk3U"
      },
      "source": [
        "# Run queries with and without Metadata filter"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cXo1n5dmQT4N"
      },
      "source": [
        "## Sample search without filter\n",
        "A basic search request issued to the Datastore\n",
        "\n",
        "We get relevant results from all four documents in the datastore"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Pj3QBnCjQUVT"
      },
      "outputs": [],
      "source": [
        "test_query = \"Google revenue\"\n",
        "\n",
        "# Send the query to the Datastore's default serving config without any filter.\n",
        "search_url = f\"https://discoveryengine.googleapis.com/v1alpha/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/dataStores/{DATASTORE_ID}/servingConfigs/default_search:search\"\n",
        "response = authed_session.post(\n",
        "    search_url,\n",
        "    headers={\"Content-Type\": \"application/json\"},\n",
        "    json={\"query\": test_query},\n",
        ")\n",
        "response.json()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xn7DSXTD2XCT"
      },
      "source": [
        "## Sample search with filter\n",
        "\n",
        "Now let's apply a filter to only show results relevant to Q2.\n",
        "\n",
        "You can see that now we only get results from a single document in the corpus which matches the filter.\n",
        "\n",
        "Note that this block shows a very basic way of querying a Datastore. You can find more information [here](https://cloud.google.com/generative-ai-app-builder/docs/preview-search-results)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "S9vz8canXd1r"
      },
      "outputs": [],
      "source": [
        "test_query = \"Google revenue\"\n",
        "\n",
        "# Same query as before, restricted by a filter on the \"quarter\" metadata field.\n",
        "search_url = f\"https://discoveryengine.googleapis.com/v1alpha/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/dataStores/{DATASTORE_ID}/servingConfigs/default_search:search\"\n",
        "search_request = {\n",
        "    \"query\": test_query,\n",
        "    \"filter\": 'quarter: ANY(\"Q2\")',\n",
        "}\n",
        "response = authed_session.post(\n",
        "    search_url,\n",
        "    headers={\"Content-Type\": \"application/json\"},\n",
        "    json=search_request,\n",
        ")\n",
        "response.json()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F_t_Afjbq1Ow"
      },
      "source": [
        "# Cleanup\n",
        "Clean up resources created in this notebook.\n",
        "\n",
        "## Clean up GCS bucket\n",
        "\n",
        "❗❗❗ Only run the below cells if you created a new bucket just for this notebook ❗❗❗\n",
        "\n",
        "Technically you could have used different buckets for documents, their Metadata and JSONL. If you happened to use the same **TEST** bucket for all of them, the following cells help you do the cleanup.\n",
        "\n",
        "To confirm the assumption above, you're asked to explicitly enter the Bucket name."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "n-SHsaMcu3YV"
      },
      "outputs": [],
      "source": [
        "def empty_bucket(bucket_name):\n",
        "    \"\"\"Removes every object listed in the given GCS bucket.\"\"\"\n",
        "    gcs_client = storage.Client()\n",
        "    target_bucket = gcs_client.get_bucket(bucket_name)\n",
        "\n",
        "    # Walk the listing and delete the objects one at a time\n",
        "    for stored_object in target_bucket.list_blobs():\n",
        "        stored_object.delete()\n",
        "\n",
        "    print(f\"Bucket {bucket_name} emptied.\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Uif1acXlwozv"
      },
      "outputs": [],
      "source": [
        "# Name of the bucket to be deleted. e.g. \"my_bucket\"\n",
        "BUCKET_TO_DELETE = \"\"  # @param {type:\"string\"}\n",
        "\n",
        "if not BUCKET_TO_DELETE:\n",
        "    # No name was entered: skip instead of asking the storage client for an empty bucket name.\n",
        "    print(\"BUCKET_TO_DELETE is empty. Enter the bucket name to delete it.\")\n",
        "else:\n",
        "    # Empty the bucket by deleting all files in it\n",
        "    empty_bucket(BUCKET_TO_DELETE)\n",
        "\n",
        "    # Create a client object\n",
        "    client = storage.Client(project=PROJECT_ID)\n",
        "\n",
        "    # Get the bucket object\n",
        "    bucket = client.get_bucket(BUCKET_TO_DELETE)\n",
        "\n",
        "    # Delete the bucket\n",
        "    bucket.delete()\n",
        "\n",
        "    print(f\"Bucket {BUCKET_TO_DELETE} deleted successfully.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T7xp_ujTxqLu"
      },
      "source": [
        "## Delete local files\n",
        "This will delete local folders for Documents, Metadata, and JSONL according to paths specified earlier in this notebook."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rr70hhfExoPW"
      },
      "outputs": [],
      "source": [
        "# Remove each local working directory. Directories that are already gone are\n",
        "# skipped so that this cell can be re-run without raising FileNotFoundError.\n",
        "for local_path in (LOCAL_DOCS_PATH, LOCAL_METADATA_PATH, LOCAL_JSONL_PATH):\n",
        "    if os.path.isdir(local_path):\n",
        "        shutil.rmtree(local_path)\n",
        "    else:\n",
        "        print(f\"{local_path} not found, skipping.\")\n",
        "\n",
        "print(\"Local files deleted successfully.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tGuk4ZnJk0S7"
      },
      "source": [
        "## Delete the Search App\n",
        "\n",
        "Delete the App if you no longer need it\n",
        "\n",
        "Alternatively you can follow [these instructions](https://console.cloud.google.com/gen-app-builder/data-stores) to delete an App from the UI\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QEfxXtzfk0rx"
      },
      "outputs": [],
      "source": [
        "# Send the delete request for the App (engine) and show the raw reply.\n",
        "app_url = f\"https://discoveryengine.googleapis.com/v1alpha/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/engines/{APP_ID}\"\n",
        "response = authed_session.delete(\n",
        "    app_url, headers={\"X-Goog-User-Project\": PROJECT_ID}\n",
        ")\n",
        "\n",
        "print(response.text)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Tgm5idL4DjjU"
      },
      "source": [
        "## Delete the Datastores\n",
        "Delete the Datastore if you no longer need it\n",
        "\n",
        "Alternatively you can follow [these instructions](https://console.cloud.google.com/gen-app-builder/data-stores) to delete a Datastore from the UI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vj8BpuS62tgt"
      },
      "outputs": [],
      "source": [
        "# Send the delete request for the Datastore and show the raw reply.\n",
        "datastore_url = f\"https://discoveryengine.googleapis.com/v1alpha/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/dataStores/{DATASTORE_ID}\"\n",
        "response = authed_session.delete(\n",
        "    datastore_url, headers={\"X-Goog-User-Project\": PROJECT_ID}\n",
        ")\n",
        "\n",
        "print(response.text)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "ingesting_unstructured_documents_with_metadata.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
